library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
Data Source https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6960825/
Mohino-Herranz I, Gil-Pita R, Rosa-Zurera M, Seoane F. Activity Recognition Using Wearable Physiological Measurements: Selection of Features from a Comprehensive Literature Study. Sensors (Basel). 2019 Dec 13;19(24):0. doi: 10.3390/s19245524. PMID: 31847261; PMCID: PMC6960825.
Activitydata <- read.csv("~/GitHub/LatentBiomarkers/Data/ActivityData/data.txt", header=FALSE, stringsAsFactors=TRUE)
featNames <- read.table("~/GitHub/LatentBiomarkers/Data/ActivityData/Featurelabels.txt", quote="\"", comment.char="")
featNames <- as.character(t(featNames))
featNames <- str_replace_all(featNames,"\\(abs\\)","_A_")
featNames[duplicated(featNames)] <- paste(featNames[duplicated(featNames)],"D",sep="_")
rep_ID <- numeric(nrow(Activitydata))
ctr <- 1
for (ind in c(1:(nrow(Activitydata)-1)))
{
rep_ID[ind] <- ctr
if (Activitydata$V1[ind] != Activitydata$V1[ind+1]) ctr <- 0;
ctr <- ctr + 1
}
rownames(Activitydata) <- paste(Activitydata$V1,rep_ID,sep="_")
colnames(Activitydata) <- c("ID",featNames,"class")
Activitydata$rep <- rep_ID
tb <- table(Activitydata$class)
classes <- c("Neu","Emo","Men","Phy")
names(classes) <- names(tb)
Activitydata$class <- classes[as.character(Activitydata$class)]
table(Activitydata$class)
#>
#> Emo Men Neu Phy
#> 1120 1120 1120 1120
ID_class <- paste(Activitydata$ID,Activitydata$class)
IDCLASS <- unique(ID_class)
theclass <- Activitydata$class[!duplicated(ID_class)]
theIDs <- Activitydata$ID[!duplicated(ID_class)]
ActivitydataAvg <- NULL
for (id in IDCLASS)
{
ActivitydataAvg <- rbind(ActivitydataAvg,apply(Activitydata[ID_class==id,featNames],2,mean))
}
colnames(ActivitydataAvg) <- featNames
rownames(ActivitydataAvg) <- IDCLASS
ActivitydataAvg <- as.data.frame(ActivitydataAvg)
ActivitydataAvg$class <- theclass
ActivitydataAvg$ID <- theIDs
table(ActivitydataAvg$class)
#>
#> Emo Men Neu Phy
#> 40 40 40 40
ActivitydataAvg <- subset(ActivitydataAvg, class=="Men" | class=="Emo")
ActivitydataAvg$class <- 1*(ActivitydataAvg$class == "Men")
table(ActivitydataAvg$class)
#>
#> 0 1
#> 40 40
studyName <- "Activity"
dataframe <- ActivitydataAvg
outcome <- "class"
TopVariables <- 10
thro <- 0.80
cexheat = 0.15
Some libraries
library(psych)
library(whitening)
library("vioplot")
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
| rows | col |
|---|---|
| 80 | 534 |
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 40 | 40 |
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1000
Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData
if (!largeSet)
{
hm <- heatMaps(data=dataframe,
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
1
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> Included: 362 , Uni p: 0.01628202 , Uncorrelated Base: 62 , Outcome-Driven Size: 0 , Base Size: 62
#>
#>
1 <R=1.000,w= 1,N= 265>, Top: 47( 21 )[ 1 : 47 Fa= 47 : 0.975 ]( 47 , 183 , 0 ),<|>Tot Used: 230 , Added: 183 , Zero Std: 0 , Max Cor: 1.000
#>
2 <R=1.000,w= 1,N= 265>, Top: 28( 6 )[ 1 : 28 Fa= 74 : 0.975 ]( 27 , 94 , 47 ),<|>Tot Used: 258 , Added: 94 , Zero Std: 0 , Max Cor: 1.000
#>
3 <R=1.000,w= 1,N= 265>, Top: 10( 2 )[ 1 : 10 Fa= 84 : 0.975 ]( 10 , 20 , 74 ),<|>Tot Used: 258 , Added: 20 , Zero Std: 0 , Max Cor: 1.000
#>
4 <R=1.000,w= 2,N= 92>, Top: 37( 1 )[ 1 : 37 Fa= 99 : 0.950 ]( 37 , 45 , 84 ),<|>Tot Used: 280 , Added: 45 , Zero Std: 0 , Max Cor: 0.992
#>
5 <R=0.992,w= 2,N= 92>, Top: 10( 1 )[ 1 : 10 Fa= 100 : 0.946 ]( 10 , 10 , 99 ),<|>Tot Used: 280 , Added: 10 , Zero Std: 0 , Max Cor: 0.946
#>
6 <R=0.946,w= 2,N= 92>, Top: 19( 1 )[ 1 : 19 Fa= 107 : 0.923 ]( 19 , 20 , 100 ),<|>Tot Used: 289 , Added: 20 , Zero Std: 0 , Max Cor: 0.983
#>
7 <R=0.983,w= 2,N= 92>, Top: 1( 1 )[ 1 : 1 Fa= 107 : 0.941 ]( 1 , 1 , 107 ),<|>Tot Used: 289 , Added: 1 , Zero Std: 0 , Max Cor: 0.930
#>
8 <R=0.930,w= 3,N= 85>, Top: 31( 2 )[ 1 : 31 Fa= 118 : 0.865 ]( 31 , 38 , 107 ),<|>Tot Used: 303 , Added: 38 , Zero Std: 0 , Max Cor: 0.983
#>
9 <R=0.983,w= 3,N= 85>, Top: 10( 1 )[ 1 : 10 Fa= 119 : 0.892 ]( 8 , 10 , 118 ),<|>Tot Used: 307 , Added: 10 , Zero Std: 0 , Max Cor: 0.954
#>
10 <R=0.954,w= 3,N= 85>, Top: 2( 1 )[ 1 : 2 Fa= 121 : 0.877 ]( 2 , 2 , 119 ),<|>Tot Used: 307 , Added: 2 , Zero Std: 0 , Max Cor: 0.963
#>
11 <R=0.963,w= 4,N= 62>, Top: 29( 2 )[ 1 : 29 Fa= 131 : 0.832 ]( 27 , 30 , 121 ),<|>Tot Used: 312 , Added: 30 , Zero Std: 0 , Max Cor: 0.891
#>
12 <R=0.891,w= 4,N= 62>, Top: 25( 1 )[ 1 : 25 Fa= 136 : 0.800 ]( 24 , 27 , 131 ),<|>Tot Used: 317 , Added: 27 , Zero Std: 0 , Max Cor: 0.988
#>
13 <R=0.988,w= 4,N= 62>, Top: 2( 1 )[ 1 : 2 Fa= 138 : 0.844 ]( 2 , 2 , 136 ),<|>Tot Used: 317 , Added: 2 , Zero Std: 0 , Max Cor: 0.981
#>
14 <R=0.981,w= 5,N= 6>, Top: 3( 1 )[ 1 : 3 Fa= 139 : 0.800 ]( 3 , 3 , 138 ),<|>Tot Used: 317 , Added: 3 , Zero Std: 0 , Max Cor: 0.804
#>
15 <R=0.804,w= 5,N= 6>, Top: 1( 1 )[ 1 : 1 Fa= 140 : 0.800 ]( 1 , 1 , 139 ),<|>Tot Used: 317 , Added: 1 , Zero Std: 0 , Max Cor: 0.798
#>
16 <R=0.798,w= 6,N= 0>
#>
[ 16 ], 0.7983933 Decor Dimension: 317 . Cor to Base: 195 , ABase: 31 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
616
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
158
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
4.84
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
3.12
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPSTM <- attr(DEdataframe,"UPSTM")
gplots::heatmap.2(1.0*(abs(UPSTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after IDeA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9976649
classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])
datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
100 : ECG_p_LF_mean 200 : IT_CCV_LF 300 : EDA_Original_mad_D
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
100 : La_ECG_p_LF_mean 200 : La_IT_CCV_LF 300 : La_EDA_Original_mad_D
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| ECG_hrv_prctile75 | 0.483 | 1.031 | -0.1471 | 1.286 | 0.0849 | 0.731 |
| ECG_hrv_geomean_A_ | -0.105 | 1.208 | 0.5327 | 1.093 | 0.1001 | 0.727 |
| IT_LF_baseline_D | 0.518 | 1.044 | -0.2219 | 0.700 | 0.5845 | 0.721 |
| IT_p_Total_baseline | 0.507 | 1.009 | -0.2168 | 0.668 | 0.6797 | 0.721 |
| IT_VLF_baseline | 0.492 | 0.990 | -0.2217 | 0.652 | 0.7285 | 0.720 |
| ECG_hrv_prctile25 | 0.244 | 1.150 | -0.4451 | 0.938 | 0.5416 | 0.719 |
| IT_PSD_baseline | 0.554 | 1.095 | -0.1829 | 0.773 | 0.2875 | 0.715 |
| ECG_hrv_mean | 0.259 | 0.865 | -0.3517 | 0.931 | 0.2559 | 0.714 |
| IT_HF_baseline | 0.692 | 1.254 | -0.0466 | 1.052 | 0.0555 | 0.713 |
| ECG_hrv_trimmean25 | 0.282 | 0.950 | -0.3372 | 0.967 | 0.5083 | 0.711 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]
pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| La_IT_BRV_baseline | -0.15629 | 0.4237 | 0.1964 | 0.2851 | 0.9649 | 0.759 |
| **La_EDA_processed_geomean_A__D** | 0.29051 | 0.5653 | -0.2026 | 0.4549 | 0.7895 | 0.742 |
| La_EDA_Filt1_mad_D | -0.01455 | 0.0533 | 0.0233 | 0.0324 | 0.3238 | 0.733 |
| La_ECG_HR_min_div_baseline | 0.06680 | 0.4682 | 0.2872 | 0.4039 | 0.0950 | 0.731 |
| La_ECG_RR_window_baseline | 0.14303 | 0.2844 | -0.0680 | 0.2889 | 0.9594 | 0.729 |
| La_EDA_processed_std_D | 0.24649 | 0.3816 | -0.0880 | 0.3304 | 0.1748 | 0.724 |
| IT_p_Total_baseline | 0.50669 | 1.0086 | -0.2168 | 0.6684 | 0.6797 | 0.721 |
| La_EDA_Original_max_D | 0.00529 | 0.0520 | -0.0230 | 0.0302 | 0.0128 | 0.716 |
| ECG_hrv_mean | 0.25862 | 0.8655 | -0.3517 | 0.9306 | 0.2559 | 0.714 |
| La_EDA_processed_trimmean25_D | 0.21468 | 0.6566 | -0.1355 | 0.3130 | 0.4613 | 0.710 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 2.85 | 281 | 0.77 |
allSigvars <- names(dc)
dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
coef <- theFormulas[[dx]]
cname <- names(theFormulas[[dx]])
names(cname) <- cname
for (cf in names(coef))
{
if (cf != dx)
{
if (coef[cf]>0)
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
}
else
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("%5.3f*%s",coef[cf],cname[cf]))
}
}
}
}
finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | |
|---|---|---|---|---|---|---|---|---|---|
| La_IT_BRV_baseline | -0.832IT_BRV_prctile25 + 1.000IT_BRV_baseline | -0.15629 | 0.4237 | 0.19644 | 0.2851 | 0.96495 | 0.759 | 0.642 | -1 |
| **La_EDA_processed_geomean_A__D** | + 1.000*EDA_processed_geomean_A__D -0.977*EDA_Filt2_std_D | 0.29051 | 0.5653 | -0.20256 | 0.4549 | 0.78945 | 0.742 | 0.601 | 2 |
| La_EDA_Filt1_mad_D | + 1.000EDA_Filt1_mad_D -1.026EDA_Filt2_std_D | -0.01455 | 0.0533 | 0.02326 | 0.0324 | 0.32376 | 0.733 | 0.508 | 0 |
| La_ECG_HR_min_div_baseline | + 0.912ECG_RR_window_geomean_A_ + 1.000ECG_HR_min_div_baseline | 0.06680 | 0.4682 | 0.28718 | 0.4039 | 0.09500 | 0.731 | 0.518 | -1 |
| La_ECG_RR_window_baseline | -0.914ECG_RR_window_geomean_A_ + 1.000ECG_RR_window_baseline | 0.14303 | 0.2844 | -0.06795 | 0.2889 | 0.95939 | 0.729 | 0.522 | -1 |
| La_EDA_processed_std_D | + 1.000EDA_processed_std_D -0.890EDA_Filt2_std_D | 0.24649 | 0.3816 | -0.08800 | 0.3304 | 0.17475 | 0.724 | 0.581 | 1 |
| IT_p_Total_baseline | 0.50669 | 1.0086 | -0.21676 | 0.6684 | 0.67972 | 0.721 | 0.721 | 5 | |
| La_EDA_Original_max_D | + 1.000EDA_Original_max_D -1.797EDA_Original_prctile75_D + 0.805*EDA_Filt1_prctile25_D | 0.00529 | 0.0520 | -0.02299 | 0.0302 | 0.01285 | 0.716 | 0.575 | -2 |
| ECG_hrv_mean | 0.25862 | 0.8655 | -0.35169 | 0.9306 | 0.25586 | 0.714 | 0.714 | 4 | |
| La_EDA_processed_trimmean25_D | + 1.000EDA_processed_trimmean25_D -0.932EDA_processed_median_D | 0.21468 | 0.6566 | -0.13552 | 0.3130 | 0.46134 | 0.710 | 0.582 | -1 |
| IT_BRV_baseline | NA | -0.35971 | 0.8468 | -0.00106 | 1.0856 | 0.08274 | 0.642 | 0.642 | NA |
| ECG_RR_window_geomean_A_ | NA | -0.09206 | 0.9615 | 0.20263 | 0.9615 | 0.99561 | 0.603 | 0.603 | 15 |
| **EDA_processed_geomean_A__D** | NA | 0.68903 | 1.3691 | 0.21447 | 1.0251 | 0.05106 | 0.601 | 0.601 | NA |
| EDA_processed_trimmean25_D | NA | -0.39525 | 1.3811 | -0.61752 | 1.3648 | 0.01157 | 0.582 | 0.582 | NA |
| EDA_processed_std_D | NA | 0.60948 | 1.1881 | 0.29184 | 0.9118 | 0.01488 | 0.581 | 0.581 | NA |
| EDA_Original_max_D | NA | 0.63696 | 1.3434 | 0.36718 | 1.1892 | 0.00726 | 0.575 | 0.575 | NA |
| EDA_Filt1_prctile25_D | NA | 0.66089 | 1.4082 | 0.37552 | 1.2296 | 0.00966 | 0.573 | 0.573 | 34 |
| EDA_Original_prctile75_D | NA | 0.64742 | 1.3754 | 0.38526 | 1.2143 | 0.01041 | 0.570 | 0.570 | NA |
| EDA_processed_median_D | NA | -0.65445 | 1.3990 | -0.51718 | 1.3120 | 0.00378 | 0.530 | 0.530 | NA |
| ECG_RR_window_baseline | NA | 0.05886 | 0.9529 | 0.11732 | 0.9276 | 0.77168 | 0.522 | 0.522 | NA |
| ECG_HR_min_div_baseline | NA | 0.15072 | 1.0047 | 0.10247 | 0.9849 | 0.67448 | 0.518 | 0.518 | NA |
| EDA_Filt1_mad_D | NA | 0.40391 | 1.0951 | 0.46113 | 1.1993 | 0.01682 | 0.508 | 0.508 | NA |
| IT_BRV_prctile25 | NA | -0.24439 | 1.0856 | -0.23727 | 1.0882 | 0.27464 | 0.503 | 0.503 | 5 |
| EDA_Filt2_std_D | NA | 0.40783 | 1.0812 | 0.42676 | 1.1548 | 0.02041 | 0.489 | 0.489 | 11 |